Exploratory Data Analysis 17.01.24
Which anthropogenic and land-related factors contribute most to the threatened status of freshwater fish globally?
Species distribution and conservation status
Catchment boundaries
Connectivity
Dams
GeoDAR (24,783 dams)
GDAT (35,140 dams)
GOODD (38,667 dams)
Land use and environmental variables
# Preview the first six rows of the full dataset (missing values included).
head(data_withNA, n = 6L)
# A tibble: 6 × 321 hybas_id continent TotalSppBasin TotalNonDDSppBasin TotalSppThreatBasin <dbl> <chr> <int> <int> <int> 1 1060000010 Africa 59 56 1 2 1060000100 Africa 81 75 2 3 1060000110 Africa 30 29 1 4 1060000150 Africa 55 54 1 5 1060000160 Africa 60 59 1 6 1060000790 Africa 64 62 NA # ℹ 316 more variables: TotalSppExtBasin <int>, ThreatenedRatio_A <dbl>, # ThreatenedRatio_B <dbl>, st_Extinct_Total <dbl>, th_Extinct_Total <dbl>, # st_Threatened_Total <dbl>, th_Threatened_Total <dbl>, CSI_max <dbl>, # CSI_min <dbl>, CSI_wMean <dbl>, CSI_mainStem <dbl>, mainStem_rivOrd <dbl>, # mainStem_reachID <dbl>, sum_reaches <dbl>, sum_riverKM <dbl>, # area_HyBASIN <dbl>, sum_dams <dbl>, dams_riverKM <dbl>, # dams_basinArea <dbl>, WISE_wMean <dbl>, OBJECTID <dbl>, NEXT_DOWN <dbl>, …
library(naniar)

# Basin identifier, the response (ThreatenedRatio_B) and the candidate
# explanatory variables kept for the missing-data overview.
keep_cols <- c(
  "hybas_id", "ThreatenedRatio_B",
  "ire_pc_sse", "crp_pc_sse", "ppd_pk_sav", "pop_ct_ssu", "ire_pc_use",
  "pst_pc_sse", "ppd_pk_uav", "pst_pc_use", "ero_kh_sav", "hft_ix_s09",
  "hft_ix_s93", "crp_pc_use", "gdp_ud_ssu", "ero_kh_uav", "pop_ct_usu",
  "hft_ix_u09", "urb_pc_sse", "gdp_ud_usu", "hft_ix_u93", "rdd_mk_sav",
  "urb_pc_use", "rdd_mk_uav", "nli_ix_sav", "rev_mc_usu", "nli_ix_uav",
  "sum_dams", "dams_riverKM", "dams_basinArea",
  "riv_tc_ssu", "riv_tc_usu", "pac_pc_sse", "sgr_dk_sav", "pac_pc_use",
  "CSI_max", "for_pc_use", "for_pc_sse", "gdp_ud_sav",
  "CSI_mainStem", "CSI_min", "CSI_wMean"
)

# all_of() errors if any listed column is absent, as the inline vector did.
data_filtered <- select(data_withNA, all_of(keep_cols))

# Plot the missingness pattern of the selected columns; the large-data
# warning is switched off explicitly.
vis_miss(data_filtered, warn_large_data = FALSE)
The HydroSHEDS basin polygons were downloaded from https://www.hydrosheds.org/products/hydrobasins and combined with the dataset into a single shapefile.
library(dplyr)
library(sf)

# Read the combined basin shapefile; the path is relative to the current
# working directory.
hydrosheds <- st_read(dsn = "Hydrosheds_all.shp")
Reading layer `Hydrosheds_all' from data source `/Users/justinecarey/Library/Mobile Documents/com~apple~CloudDocs/09_study/BOKU/2023-2024 WS/Exploratory Data Analysis in R/Final/Hydrosheds_all.shp' using driver `ESRI Shapefile' Simple feature collection with 16397 features and 13 fields Geometry type: MULTIPOLYGON Dimension: XY Bounding box: xmin: -180 ymin: -55.9875 xmax: 180 ymax: 83.62564 Geodetic CRS: WGS 84
# Attach the per-basin attributes to the basin polygons. A left join keeps
# every polygon in `hydrosheds`; polygons without a matching `hybas_id` get NA
# in the joined columns.
joined_world <- left_join(
  hydrosheds,
  data_withNA,
  by = c("HYBAS_ID" = "hybas_id")
)
library(ggplot2)

# World map of TotalSppBasin per basin polygon. Polygon outlines are
# suppressed (colour = NA) and basins with no value are drawn in grey.
ggplot(data = joined_world) +
  geom_sf(mapping = aes(fill = TotalSppBasin), colour = NA) +
  scale_fill_gradient(
    name = "Total species",
    low = "khaki",
    high = "palegreen4",
    na.value = "grey"
  ) +
  theme_void()
# World map of TotalSppThreatBasin per basin polygon. Polygon outlines are
# suppressed (colour = NA) and basins with no value are drawn in grey.
ggplot(data = joined_world) +
  geom_sf(mapping = aes(fill = TotalSppThreatBasin), colour = NA) +
  scale_fill_gradient(
    low = "dodgerblue2",
    high = "red4",
    na.value = "grey"
  ) +
  labs(fill = "Total threatened species") +
  theme_void()
# Bin the per-basin extinct-species count into discrete classes:
# (-Inf, 0] -> "0", (0, 1] -> "1", (1, 2] -> "2", (2, 3] -> "3", (3, Inf] -> "15".
# NOTE(review): the top bin covers every count above 3 but is labelled "15" --
# presumably the observed maximum; confirm, or relabel it ">3".
joined_world$Extinct <- cut(
  joined_world$TotalSppExtBasin,
  breaks = c(-Inf, 0, 1, 2, 3, Inf),
  labels = c("0", "1", "2", "3", "15"),
  include.lowest = TRUE
)

# World map of the extinct-species classes.
# The colours are named by factor level, so each class always gets the same
# colour regardless of which levels occur in the data. With the previous
# unnamed 4-colour / 4-label scale, a "0" class in the data or a missing
# middle class would either break the scale or shift the colours. Levels
# without an entry here (i.e. "0") and missing values fall back to the scale's
# NA colour.
ggplot(data = joined_world) +
  geom_sf(mapping = aes(fill = Extinct), colour = NA) +
  scale_fill_manual(
    values = c(
      "1" = "darkolivegreen3",
      "2" = "gold2",
      "3" = "orange3",
      "15" = "red4"
    )
  ) +
  labs(fill = "Total extinct species") +
  theme_void()
# Interactive scatterplot matrix of the response and two pressure variables,
# coloured by continent.
# The columns are selected by name rather than by position (previously
# c(8, 14, 23)), so the plot does not silently change if the column order of
# `data_withNA` changes.
# Requires GGally (ggpairs) and plotly (ggplotly, highlight) to be attached.
data_withNA %>%
  ggpairs(
    mapping = aes(colour = continent),
    columns = c("ThreatenedRatio_B", "CSI_min", "dams_riverKM")
  ) %>%
  ggplotly(tooltip = c("x", "y", "colour")) %>%
  highlight("plotly_selected")
# dplyr must be attached before filter()/anti_join() are used; previously the
# library() call came after the first filter().
library(dplyr)

# Upper limit on dams_riverKM for basins shown in the 3D plot.
threshold_value <- 2.5

# Basins at or below the threshold. filter() also drops rows where
# dams_riverKM is NA.
filtered_data <- data_withNA %>%
  filter(dams_riverKM <= threshold_value)

# Rows of the full dataset that did not pass the filter (above the threshold
# or with missing dams_riverKM), matched on hybas_id.
different_obs <- anti_join(data_withNA, filtered_data, by = "hybas_id")

# One colour per continent level -- assumes at most six levels; TODO confirm.
custom_colours <- c(
  "#1f78b4", "#33a02c", "#e31a1c", "#ff7f00", "#6a3d9a", "yellow"
)

# Interactive 3D scatter of CSI_min, ThreatenedRatio_B and dams_riverKM,
# coloured by continent.
plot_ly(
  filtered_data,
  x = ~CSI_min,
  y = ~ThreatenedRatio_B,
  z = ~dams_riverKM,
  color = ~continent,
  colors = custom_colours
) %>%
  add_markers(marker = list(size = 3))
# Principal component analysis on columns 2 to 41 of the NA-free dataset,
# with each variable centred and scaled to unit variance.
# NOTE(review): the columns are selected by position -- confirm that 2:41
# covers exactly the intended explanatory variables in `data_noNA`.
PCA_nona <- prcomp(
  x = data_noNA[, 2:41],
  center = TRUE,
  scale. = TRUE
)

# Standard deviation and (cumulative) proportion of variance per component.
summary(PCA_nona)
Importance of components: PC1 PC2 PC3 PC4 PC5 PC6 PC7 Standard deviation 3.1654 1.74110 1.67662 1.4792 1.42515 1.35322 1.30937 Proportion of Variance 0.2505 0.07579 0.07028 0.0547 0.05078 0.04578 0.04286 Cumulative Proportion 0.2505 0.32628 0.39656 0.4513 0.50204 0.54782 0.59068 PC8 PC9 PC10 PC11 PC12 PC13 PC14 Standard deviation 1.24077 1.1593 1.13093 1.0917 1.05912 1.03677 0.96699 Proportion of Variance 0.03849 0.0336 0.03198 0.0298 0.02804 0.02687 0.02338 Cumulative Proportion 0.62917 0.6628 0.69474 0.7245 0.75258 0.77945 0.80283 PC15 PC16 PC17 PC18 PC19 PC20 PC21 Standard deviation 0.95175 0.88344 0.83805 0.81661 0.7721 0.76822 0.61540 Proportion of Variance 0.02265 0.01951 0.01756 0.01667 0.0149 0.01475 0.00947 Cumulative Proportion 0.82547 0.84499 0.86254 0.87922 0.8941 0.90887 0.91834 PC22 PC23 PC24 PC25 PC26 PC27 PC28 Standard deviation 0.61292 0.56839 0.56071 0.53587 0.51636 0.48128 0.45242 Proportion of Variance 0.00939 0.00808 0.00786 0.00718 0.00667 0.00579 0.00512 Cumulative Proportion 0.92773 0.93581 0.94367 0.95085 0.95752 0.96331 0.96842 PC29 PC30 PC31 PC32 PC33 PC34 PC35 Standard deviation 0.42165 0.41572 0.40581 0.38230 0.3740 0.35599 0.31387 Proportion of Variance 0.00444 0.00432 0.00412 0.00365 0.0035 0.00317 0.00246 Cumulative Proportion 0.97287 0.97719 0.98131 0.98496 0.9885 0.99163 0.99409 PC36 PC37 PC38 PC39 PC40 Standard deviation 0.27995 0.24990 0.22596 0.1902 0.09194 Proportion of Variance 0.00196 0.00156 0.00128 0.0009 0.00021 Cumulative Proportion 0.99605 0.99761 0.99888 0.9998 1.00000
library(ggfortify)

# PCA biplot: observations coloured by ThreatenedRatio_B, with labelled
# loading arrows.
# The argument is `loadings.colour` (singular). The previous spelling
# `loadings.colours` matched no parameter, so the requested blue was not
# applied to the loading arrows.
autoplot(
  PCA_nona,
  data = data_noNA,
  colour = "ThreatenedRatio_B",
  loadings = TRUE,
  loadings.colour = "blue",
  loadings.label = TRUE,
  loadings.label.size = 3
)
library(factoextra)

# Scree plot of the PCA eigenvalues / explained variance per component.
fviz_eig(X = PCA_nona)
library(corrplot)

# List of matrices containing all results for the active variables.
# NOTE(review): the name `var` masks stats::var for the rest of the session.
var <- get_pca_var(PCA_nona)

# Plot the variable coordinates on each component. is.corr = FALSE because
# the input is a general matrix, not a correlation matrix.
corrplot(var[["coord"]], is.corr = FALSE)
IncNodePurity ire_pc_sse 2.0440932 crp_pc_sse 1.8930350 ppd_pk_sav 5.5427870 pop_ct_ssu 4.1981079 ire_pc_use 3.5280902 pst_pc_sse 4.2877228 ppd_pk_uav 6.3457655 pst_pc_use 4.5753773 ero_kh_sav 7.5318707 hft_ix_s09 4.2841666 hft_ix_s93 4.0061011 crp_pc_use 2.6675691 gdp_ud_ssu 2.9039621 ero_kh_uav 7.9507029 pop_ct_usu 3.7881073 hft_ix_u09 3.7826659 urb_pc_sse 0.4743752 gdp_ud_usu 2.6797934 hft_ix_u93 4.2279785 rdd_mk_sav 4.5319660 urb_pc_use 0.5134280 rdd_mk_uav 4.8897086 nli_ix_sav 5.4930418 rev_mc_usu 1.3949523 nli_ix_uav 2.4441400 sum_dams 0.5359298 dams_riverKM 0.7949689 dams_basinArea 0.7624746 riv_tc_ssu 4.9694795 riv_tc_usu 4.6713837 pac_pc_sse 2.8259419 sgr_dk_sav 9.5567625 pac_pc_use 2.9776126 CSI_max 0.1276659 for_pc_use 2.9268159 for_pc_sse 2.7028816 gdp_ud_sav 15.4595145 CSI_mainStem 4.9335657 CSI_min 4.9854327 CSI_wMean 4.1357270